## This code and these analyses were produced by Jason Anastasopoulos (j.andronici@gmail.com)
## and Anthony Bertelli for the paper "Understanding Delegation Through Machine Learning: A Method and Application to the EU"

## Here we train a series of gradient boosted tree classifiers on the hand-coded delegation and constraint data,
## apply the classifiers to the provisions, and calculate delegation and constraint ratios.
## Each of the trained classifiers is saved in a .RData file.


library(pacman)


# This loads and installs the packages you need at once
pacman::p_load(tm,SnowballC,foreign,plyr,twitteR,slam,foreign,wordcloud,LiblineaR,e1071, topicmodels,readr,
               monkeylearn, EBglmnet, bayesreg, ggplot2,randomForest,
               glmnet, monomvn, caret, rpart, xgboost, boot,dplyr, ranger,
               xgboost,quanteda)


##############################################################################################################
##############################################################################################################
##############################################################################################################
######################### Training ###########################################################################
##############################################################################################################
##############################################################################################################
##############################################################################################################

# Location of the hand-coded Commission (COM) training data.
data <- "~/Training Data/Training_Data_COM.csv"

textdata <- read.csv(file = data)

# Text of each hand-coded provision.
provision.text <- textdata$Text

# Hand-coded labels.
#   * constraints.com: CSV columns 21 to 32, kept as the constraint codings.
#     A separate classifier is trained for each usable constraint category
#     further down, and its performance statistics are reported.
#   * delegation.com: the `Lable` column (spelling as used by this script).
# NOTE(review): the column positions and the `Lable` spelling are assumed to
# match the CSV header -- confirm against the data file.
constraints.com <- textdata[21:32]
delegation.com <- textdata$Lable

# Pair the provision text with each set of labels so that rows can be
# filtered together later on.
data.constraints.COM <- data.frame(provision.text, constraints.com)
data.delegation.COM <- data.frame(provision.text, delegation.com)

# Build the document-feature matrix used as classifier input:
# corpus -> unigram/bigram tokens -> token filtering -> dfm -> trimming ->
# removal of documents that end up with no features.

# Placeholder for cleaned documents; initialised empty and not used again in
# the code below.
cleandocs<-c()


# Convert every provision text to a character string before building the corpus.

provisiontext<-sapply(textdata$Text,as.character)

provisiontext = corpus(provisiontext) # Create a quanteda corpus object


# Tokenise into unigrams and bigrams (ngrams = 1:2).
# NOTE(review): the `ngrams` argument of tokens() is version dependent in
# quanteda (newer releases use tokens_ngrams()) -- confirm the installed version.
token.dirty  = tokens(provisiontext,ngrams = 1:2)
# Remove tokens matching the listed patterns (URL fragments, symbols, a stray
# " g "); selection = "remove" drops matches instead of keeping them.
token.clean = tokens_select(token.dirty, 
                            c("/","@", "\\|","#","http","https" ,".com","$", " g "),
                            selection ="remove")


# Document-feature matrix with English stopwords removed, punctuation removed
# and features stemmed.
# NOTE(review): `remove` and `stem` as dfm() arguments are version dependent
# in quanteda -- confirm the installed version.
dtm = dfm(token.clean, remove = stopwords("english"), 
              remove_punct = TRUE,stem = TRUE)

# Trim the matrix with a sparsity threshold of 0.99.
# NOTE(review): presumably this drops features absent from more than 99% of
# documents (tm-style sparsity) -- confirm against the installed quanteda.
dtm  = dfm_trim(dtm,sparsity=.99)


# Number of retained feature occurrences per document (computed on a dense copy).
rowTotals<-rowSums(as.matrix(dtm))
# Earlier variant, kept for reference:
#rowTotals <- apply(dtm10 , 1, sum) #Find the sum of words in each Document


dtm.new   <- dtm[rowTotals> 0, ]           #remove all docs without words

# Apply the same row filter to the label frames so they stay aligned with
# the rows of dtm.new.
data.constraints.COM.new<-data.constraints.COM[rowTotals> 0,]
data.delegation.COM.new<-data.delegation.COM[rowTotals> 0,]



############################################################################################################
############################################################################################################
############################################################################################################
############################################################################################################
##################################    Classifier Training Performance Constraint ###########################
############################################################################################################
############################################################################################################
############################################################################################################
############################################################################################################
############################################################################################################
############################################################################################################
############################################################################################################

# We have to train 12 different classifiers for constraint, save the classifiers as objects, report the performance of the
# classifiers, and produce word importance plots

#"MS.Rulemaking.Requirements"      
#[10] "MS.Time.Limit"                    "MS.Reporting.Requirements"        "MS.Consultation.Requirements"    
#[13] "MS.Appeals.Procedures"            "MS.Executive.Action.Required"     "MS.Legislative.Action.Required"  
#[16] "MS.Spending.Limits"               "MS.Executive.Action.Possible"     "MS.Exemptions"                   
#[19] "MS.Public.Hearings"               "MS.Legislative.Action.Possible"


# Constraint names
#constraint.names.upper = names(data.constraints.MS.new[,2:13])
#constraint.names.lower = tolower(names(data.constraints.MS.new[,2:13]))
#constraint.names.mat = data.frame(rep(0,dim(data.constraints.MS.new)[1]))

#[1] "ms.rulemaking.requirements"     "ms.time.limit"                  "ms.reporting.requirements"     
#[4] "ms.consultation.requirements"   "ms.appeals.procedures"          "ms.executive.action.required"  
#[7] "ms.legislative.action.required" "ms.spending.limits"             "ms.executive.action.possible"  
#[10] "ms.exemptions"                  "ms.public.hearings"             "ms.legislative.action.possible"

# Binary constraint indicators for the Commission. Every hand-coded column is
# collapsed to 1 when the coded value is >= 1 and to 0 otherwise.
# The original note here says the delegation ratio will need a denominator of 6.
# NOTE(review): eight categories are kept in `constraint.label.mat` below --
# confirm which count is intended.
com.rulemaking.requirements <- ifelse(
  as.numeric(data.constraints.COM.new$Comm.Rulemaking.Requirements) >= 1, 1, 0
)

com.time.limit <- ifelse(
  data.constraints.COM.new$Comm.Time.Limit >= 1, 1, 0
)

com.reporting.requirements <- ifelse(
  data.constraints.COM.new$Comm.Reporting.Requirements >= 1, 1, 0
)

com.consultation.requirements <- ifelse(
  data.constraints.COM.new$Comm.Consultation.Requirements >= 1, 1, 0
)

# Note: there are NO positive cases for appeals procedures.
com.appeals.procedures <- ifelse(
  data.constraints.COM.new$Comm.Appeals.Procedures >= 1, 1, 0
)

com.executive.action.required <- ifelse(
  data.constraints.COM.new$Comm.Executive.Action.Required >= 1, 1, 0
)

com.legislative.action.required <- ifelse(
  data.constraints.COM.new$Comm.Legislative.Action.Required >= 1, 1, 0
)

com.spending.limits <- ifelse(
  data.constraints.COM.new$Comm.Spending.Limits >= 1, 1, 0
)

com.executive.action.possible <- ifelse(
  data.constraints.COM.new$Comm.Executive.Action.Possible >= 1, 1, 0
)

com.exemptions <- ifelse(
  data.constraints.COM.new$Comm.Exemptions >= 1, 1, 0
)

com.public.hearings <- ifelse(
  data.constraints.COM.new$Comm.Public.Hearings >= 1, 1, 0
)

com.legislative.action.possible <- ifelse(
  data.constraints.COM.new$Comm.Legislative.Action.Possible >= 1, 1, 0
)

# Final label matrix: one column per constraint category that gets a
# classifier. Categories marked unusable stay commented out, and appeals
# procedures (no positive cases) is left out as well.
constraint.label.mat <- data.frame(
  com.rulemaking.requirements,
  com.time.limit,
  com.reporting.requirements,
  com.consultation.requirements,
  com.executive.action.required,
  # com.legislative.action.required, UNUSABLE
  # com.spending.limits, # unusable
  com.executive.action.possible,
  # com.exemptions, # unusable
  com.public.hearings,
  com.legislative.action.possible
)

# Train one gradient-boosted tree classifier per column of
# `constraint.label.mat`. For every constraint category the loop
#   1. draws a 70/30 train/test split (the seed is reset each time, so every
#      category uses the same split),
#   2. chooses the number of boosting rounds by 5-fold cross-validation,
#   3. fits the final model and scores it on the held-out 30%,
#   4. writes a term-importance plot and a CV error-curve plot, and
#   5. snapshots the whole workspace to an .RData file named after the category.
n.constraints <- ncol(constraint.label.mat)

# One name per classifier, derived from the number of label columns. A
# hard-coded vector of six names left later classifiers without a name, so
# assign() was called with NA for them.
classifiernames <- paste0("xgb", seq_len(n.constraints))

# Trained boosters, stored by name. Concatenating boosters with c() flattens
# them, so a named list is used instead.
classifiers <- vector("list", n.constraints)
names(classifiers) <- classifiernames

# Performance table, started with zero rows so that no all-zero placeholder
# row ends up in the exported CSV. New rows are prepended (most recent first).
performance.columns <- c("Constraint Type", "Accuracy", "Sensitivity",
                         "Specificity", "F1", "Precision")
constraint.performance.table <- data.frame(
  character(0), numeric(0), numeric(0), numeric(0), numeric(0), numeric(0),
  stringsAsFactors = FALSE
)
names(constraint.performance.table) <- performance.columns

# Dense feature matrix; identical for every constraint, so built once.
dtm_mat <- as.matrix(dtm.new)

# All figures are written relative to this directory.
setwd("/Users/jason/Dropbox/Research/Papers/Delegation-ML-Project/Draft-EU-Paper/figs/APSR_RnR")

# Start loop here
for (i in seq_len(n.constraints)) {
  constrainttype <- names(constraint.label.mat)[i]

  set.seed(42616)
  # mllabel, traindata and testdata are not needed for fitting; they are kept
  # because save.image() below snapshots the whole workspace.
  mllabel <- data.frame(constraint.label.mat[,i],dtm_mat)
  train <- sample(seq_len(nrow(mllabel)), nrow(mllabel) * 0.7)

  trainX <- dtm_mat[train, , drop = FALSE]
  testX <- dtm_mat[-train, , drop = FALSE]
  trainY <- constraint.label.mat[, i][train]
  testY <- constraint.label.mat[, i][-train]

  traindata <- data.frame(trainY, trainX)
  testdata <- data.frame(testY, testX)

  traindata.b <- xgb.DMatrix(data = trainX, label = trainY)
  testdata.b <- xgb.DMatrix(data = testX, label = testY)

  # Class-imbalance weight: negatives / positives in the training split.
  # Counting with explicit levels keeps "0" and "1" in fixed positions even
  # when one class is missing from the split.
  class.counts <- table(factor(trainY, levels = c(0, 1)))
  if (any(class.counts == 0)) {
    stop("Training split for '", constrainttype,
         "' does not contain both classes; cannot train a classifier.",
         call. = FALSE)
  }
  pospredweight <- class.counts[["0"]] / class.counts[["1"]]

  set.seed(100)

  # Parameter tuning
  # these are default parameters
  params <- list(booster = "gbtree", objective = "binary:logistic",
                 eta = 0.3, gamma = 0, max_depth = 6, min_child_weight = 1,
                 subsample = 1, colsample_bytree = 1)

  # Cross-validate to choose the number of boosting rounds.
  #  * `early_stopping_rounds` / `print_every_n` are the argument names
  #    xgb.cv() defines; dotted spellings are not matched and are passed on
  #    as booster parameters, so CV early stopping would never run.
  #  * `metrics = "error"` pins the evaluation metric so the log always has
  #    the train_error_mean / test_error_mean columns read below.
  xgbcv <- xgb.cv(params = params, data = traindata.b,
                  nrounds = 200, nfold = 5, showsd = TRUE,
                  stratified = TRUE, early_stopping_rounds = 20,
                  print_every_n = 10,
                  maximize = FALSE, metrics = "error",
                  scale_pos_weight = pospredweight)

  ## Train and test error curves from the CV log
  test.error <- xgbcv$evaluation_log[,4]
  train.error <- xgbcv$evaluation_log[,2]

  # First iteration with the lowest mean cross-validated test error.
  best.iter <- which.min(xgbcv$evaluation_log$test_error_mean)

  # Final model, trained for the CV-selected number of rounds.
  # NOTE(review): early stopping here follows the LAST watchlist entry, which
  # is the training set -- confirm that this is intended.
  xgb1 <- xgb.train(params = params, data = traindata.b,
                    nrounds = best.iter,
                    watchlist = list(val = testdata.b, train = traindata.b),
                    print_every_n = 10, early_stopping_rounds = 10,
                    maximize = FALSE, eval_metric = "error",
                    scale_pos_weight = pospredweight,
                    alpha = 1)
  classifiers[[classifiernames[i]]] <- xgb1
  assign(classifiernames[i], xgb1)

  # model prediction on the held-out split, thresholded at 0.5
  xgbpred <- predict(xgb1, testdata.b)
  xgbpred <- ifelse(xgbpred > 0.5, 1, 0)

  # Fix both factors to levels 0/1 so the confusion matrix is well defined
  # even when every prediction (or every test label) falls in one class.
  cmat <- confusionMatrix(factor(xgbpred, levels = c(0, 1)),
                          factor(testY, levels = c(0, 1)),
                          positive = "1")

  # Metrics are looked up by name rather than by position.
  F1 <- round(cmat$byClass[["F1"]], 3)
  accuracy <- round(cmat$overall[["Accuracy"]], 3)
  sensitivity <- round(cmat$byClass[["Sensitivity"]], 3)
  specificity <- round(cmat$byClass[["Specificity"]], 3)
  precision <- round(cmat$byClass[["Pos Pred Value"]], 3)

  performance.row <- data.frame(constrainttype, accuracy, sensitivity,
                                specificity, F1, precision,
                                stringsAsFactors = FALSE)
  names(performance.row) <- performance.columns
  constraint.performance.table <- rbind(performance.row,
                                        constraint.performance.table)

  # Produce a word importance plot for each category
  mat <- xgb.importance(feature_names = colnames(trainX), model = xgb1)

  png(paste0("term-importance-com", i, ".png"))
  # head() keeps at most ten rows; mat[1:10] would add NA rows when fewer
  # than ten features carry any importance.
  xgb.plot.importance(importance_matrix = head(mat, 10),
                      xlab = "Information Gain",
                      ylab = "Term")
  dev.off()

  # Plot cross-validated training and test error by iteration
  n.rounds.run <- length(test.error$test_error_mean)
  df <- data.frame(
    Iteration = rep(seq_len(n.rounds.run), times = 2),
    Error = c(test.error$test_error_mean, train.error$train_error_mean),
    Type = rep(c("Test Error", "Train Error"), each = n.rounds.run)
  )

  error.plot <- ggplot(data = df, aes(x = Iteration, y = Error, colour = Type)) +
    geom_line() +
    geom_point() + theme_classic() +
    geom_vline(xintercept = best.iter, linetype = "dotted")
  ggsave(paste0("traintest", i, ".png"), plot = error.plot)

  # Store each trained classifier in the "Trained-Classifiers" directory.
  # save.image() writes the entire workspace, not only the booster.
  classifiername <- paste0(constrainttype, "_classifier-com", ".RData")
  directory <- "~/Dropbox/Research/Papers/Delegation-ML-Project/Trained-Classifiers/APSR_RnR/COM-Classifiers/"
  save.image(paste0(directory, classifiername))

}

##### Now add classifiers for random forest,SVM and regularized linear regression #######



######## End Loop here######## End Loop here######## End Loop here######## End Loop here######## End Loop here
######## End Loop here######## End Loop here######## End Loop here######## End Loop here######## End Loop here
######## End Loop here######## End Loop here######## End Loop here######## End Loop here######## End Loop here
######## End Loop here######## End Loop here######## End Loop here######## End Loop here######## End Loop here



############################################################################################################
############################################################################################################
############################################################################################################
############################################################################################################
##################################    Classifier Training Performance Delegation ###########################
############################################################################################################
############################################################################################################
############################################################################################################
############################################################################################################
############################################################################################################
############################################################################################################
############################################################################################################
#### Need to incorporate codings here

# Train and evaluate the single delegation classifier: 95/5 train/test split,
# number of boosting rounds chosen by 5-fold cross-validation, evaluation on
# the held-out 5%, term-importance and error-curve plots, and a snapshot of
# the whole workspace.
delegation.COM <- data.delegation.COM.new$delegation.com

dtm_mat <- as.matrix(dtm.new)
# mllabel, traindata and testdata are not needed for fitting; they are kept
# because save.image() below snapshots the whole workspace.
mllabel <- data.frame(delegation.COM, dtm_mat)
# NOTE(review): no set.seed() directly precedes this draw, so the split
# depends on the RNG state left by the code that ran before it.
train <- sample(seq_len(nrow(mllabel)), nrow(mllabel) * 0.95)

trainX <- dtm_mat[train, , drop = FALSE]
testX <- dtm_mat[-train, , drop = FALSE]
trainY <- delegation.COM[train]
testY <- delegation.COM[-train]

traindata <- data.frame(trainY, trainX)
testdata <- data.frame(testY, testX)

traindata.b <- xgb.DMatrix(data = trainX, label = trainY)
testdata.b <- xgb.DMatrix(data = testX, label = testY)

# Class-imbalance weight: negatives / positives in the training split.
# Counting with explicit levels keeps "0" and "1" in fixed positions even
# when one class is missing from the split.
class.counts <- table(factor(trainY, levels = c(0, 1)))
if (any(class.counts == 0)) {
  stop("Delegation training split does not contain both classes; ",
       "cannot train a classifier.", call. = FALSE)
}
pospredweight <- class.counts[["0"]] / class.counts[["1"]]

set.seed(100)

# Parameter tuning
# these are default parameters
params <- list(booster = "gbtree", objective = "binary:logistic",
               eta = 0.3, gamma = 0, max_depth = 6, min_child_weight = 1,
               subsample = 1, colsample_bytree = 1)

# Cross-validate to choose the number of boosting rounds.
#  * `early_stopping_rounds` / `print_every_n` are the argument names
#    xgb.cv() defines; dotted spellings are not matched and are passed on as
#    booster parameters, so CV early stopping would never run.
#  * `metrics = "error"` pins the evaluation metric so the log always has the
#    train_error_mean / test_error_mean columns read below.
xgbcv <- xgb.cv(params = params, data = traindata.b,
                nrounds = 500, nfold = 5, showsd = TRUE,
                stratified = TRUE, early_stopping_rounds = 20,
                print_every_n = 10,
                maximize = FALSE, metrics = "error",
                scale_pos_weight = pospredweight)

## Train and test error curves from the CV log
test.error <- xgbcv$evaluation_log[,4]
train.error <- xgbcv$evaluation_log[,2]

# First iteration with the lowest mean cross-validated test error.
best.iter <- which.min(xgbcv$evaluation_log$test_error_mean)

# Final model, trained for the CV-selected number of rounds.
# NOTE(review): early stopping here follows the LAST watchlist entry, which
# is the training set -- confirm that this is intended.
xgb1 <- xgb.train(params = params, data = traindata.b,
                  nrounds = best.iter,
                  watchlist = list(val = testdata.b, train = traindata.b),
                  print_every_n = 10, early_stopping_rounds = 10,
                  maximize = FALSE, eval_metric = "error",
                  scale_pos_weight = pospredweight,
                  alpha = 1)

# model prediction on the held-out split, thresholded at 0.5
xgbpred <- predict(xgb1, testdata.b)
xgbpred <- ifelse(xgbpred > 0.5, 1, 0)

# Fix both factors to levels 0/1 so the confusion matrix is well defined even
# when every prediction (or every test label) falls in one class, which is
# plausible with a 5% test split.
cmat <- confusionMatrix(factor(xgbpred, levels = c(0, 1)),
                        factor(testY, levels = c(0, 1)),
                        positive = "1")

# Metrics are looked up by name rather than by position.
F1 <- round(cmat$byClass[["F1"]], 3)
accuracy <- round(cmat$overall[["Accuracy"]], 3)
sensitivity <- round(cmat$byClass[["Sensitivity"]], 3)
specificity <- round(cmat$byClass[["Specificity"]], 3)
precision <- round(cmat$byClass[["Pos Pred Value"]], 3)

# Label this row explicitly. Reusing `constrainttype` would tag the delegation
# results with the name of the last constraint classifier trained earlier.
outvec <- c("delegation", accuracy, sensitivity, specificity, F1, precision)
constraint.performance.table <- rbind(outvec, constraint.performance.table)

# Produce a word importance plot for the delegation classifier
setwd("/Users/jason/Dropbox/Research/Papers/Delegation-ML-Project/Draft-EU-Paper/figs/APSR_RnR")

mat <- xgb.importance(feature_names = colnames(trainX), model = xgb1)

png(paste0("term-importance", "delegation-com.png"))
# head() keeps at most ten rows; mat[1:10] would add NA rows when fewer than
# ten features carry any importance.
xgb.plot.importance(importance_matrix = head(mat, 10),
                    xlab = "Information Gain",
                    ylab = " ", cex = 1.5)
dev.off()

# Plot cross-validated training and test error by iteration
n.rounds.run <- length(test.error$test_error_mean)
df <- data.frame(
  Iteration = rep(seq_len(n.rounds.run), times = 2),
  Error = c(test.error$test_error_mean, train.error$train_error_mean),
  Type = rep(c("Test Error", "Train Error"), each = n.rounds.run)
)

error.plot <- ggplot(data = df, aes(x = Iteration, y = Error, colour = Type)) +
  geom_line() +
  geom_point() + theme_classic() +
  geom_vline(xintercept = best.iter, linetype = "dotted")
ggsave(paste0("traintest", "delegation.png"), plot = error.plot)


# Store the trained classifier in the "Trained-Classifiers" directory.
# save.image() writes the entire workspace, not only the booster.
save.image("/Users/jason/Dropbox/Research/Papers/Delegation-ML-Project/Trained-Classifiers/APSR_RnR/COM-Classifiers/delegation-com-classifier.RData")

######## End Loop here######## End Loop here######## End Loop here######## End Loop here######## End Loop here
######## End Loop here######## End Loop here######## End Loop here######## End Loop here######## End Loop here
######## End Loop here######## End Loop here######## End Loop here######## End Loop here######## End Loop here
######## End Loop here######## End Loop here######## End Loop here######## End Loop here######## End Loop here

# Export the accumulated classifier performance table to the final-tables
# directory as CSV.
write.csv(
  x = constraint.performance.table,
  file = "/Users/jason/Dropbox/Research/Papers/Delegation-ML-Project/FINAL-OUTPUT-DATA/APSR_RnR/Final-Tables/COM-Performance-Final-Plus.csv"
)

######## End Loop here######## End Loop here######## End Loop here######## End Loop here######## End Loop here
######## End Loop here######## End Loop here######## End Loop here######## End Loop here######## End Loop here
######## End Loop here######## End Loop here######## End Loop here######## End Loop here######## End Loop here
######## End Loop here######## End Loop here######## End Loop here######## End Loop here######## End Loop here


















